Show the code
import pandas as pd
import polars as pl
import numpy as np
from lets_plot import *
# add the additional libraries you need to import for ML here
LetsPlot.setup_html(isolated_frame=True)import pandas as pd
import polars as pl
import numpy as np
from lets_plot import *
# add the additional libraries you need to import for ML here
LetsPlot.setup_html(isolated_frame=True)# import your data here using pandas and the URL
url = "https://github.com/fivethirtyeight/data/raw/master/star-wars-survey/StarWars.csv"
df = pl.read_csv("StarWars.csv")Shorten the column names and clean them up for easier use with pandas. Provide a table or list that exemplifies how you fixed the names.
Many of the names were manually renamed, other names were taken from the column details and adapted to lower case and had the spaces replaced with underscores.
# Include and execute your code here
# Fix UTF encoding
# Create a readme/data.md using the og column names
df_clean = df.rename({
df.columns[1]: "seen",
df.columns[2]: "fan",
df.columns[3]: "seen_epi_i",
df.columns[4]: "seen_epi_ii",
df.columns[5]: "seen_epi_iii",
df.columns[6]: "seen_epi_iv",
df.columns[7]: "seen_epi_v",
df.columns[8]: "seen_epi_vi",
df.columns[9]: "rank_epi_i",
df.columns[10]: "rank_epi_ii",
df.columns[11]: "rank_epi_iii",
df.columns[12]: "rank_epi_iv",
df.columns[13]: "rank_epi_v",
df.columns[14]: "rank_epi_vi",
df.columns[15]: df[df.columns[15]][0].lower().replace(' ', '_'),
df.columns[16]: df[df.columns[16]][0].lower().replace(' ', '_'),
df.columns[17]: df[df.columns[17]][0].lower().replace(' ', '_'),
df.columns[18]: df[df.columns[18]][0].lower().replace(' ', '_'),
df.columns[19]: df[df.columns[19]][0].lower().replace(' ', '_'),
df.columns[20]: df[df.columns[20]][0].lower().replace(' ', '_'),
df.columns[21]: df[df.columns[21]][0].lower().replace(' ', '_'),
df.columns[22]: df[df.columns[22]][0].lower().replace(' ', '_'),
df.columns[23]: df[df.columns[23]][0].lower().replace(' ', '_'),
df.columns[24]: df[df.columns[24]][0].lower().replace(' ', '_'),
df.columns[25]: df[df.columns[25]][0].lower().replace(' ', '_'),
df.columns[26]: df[df.columns[26]][0].lower().replace(' ', '_'),
df.columns[27]: df[df.columns[27]][0].lower().replace(' ', '_'),
df.columns[28]: df[df.columns[28]][0].lower().replace(' ', '_'),
df.columns[29]: "shot_first",
df.columns[30]: "ex_uni",
df.columns[31]: "fan_ex_uni",
df.columns[32]: "fan_star_trek",
df.columns[33]: df.columns[33].lower().replace(' ', '_'),
df.columns[34]: df.columns[34].lower().replace(' ', '_'),
df.columns[35]: df.columns[35].lower().replace(' ', '_'),
df.columns[36]: df.columns[36].lower().replace(' ', '_'),
df.columns[37]: "location"
})
df_clean = df_clean[1:]
display(df_clean.head())| RespondentID | seen | fan | seen_epi_i | seen_epi_ii | seen_epi_iii | seen_epi_iv | seen_epi_v | seen_epi_vi | rank_epi_i | rank_epi_ii | rank_epi_iii | rank_epi_iv | rank_epi_v | rank_epi_vi | han_solo | luke_skywalker | princess_leia_organa | anakin_skywalker | obi_wan_kenobi | emperor_palpatine | darth_vader | lando_calrissian | boba_fett | c-3p0 | r2_d2 | jar_jar_binks | padme_amidala | yoda | shot_first | ex_uni | fan_ex_uni | fan_star_trek | gender | age | household_income | education | location |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str |
| 3292879998 | "Yes" | "Yes" | "Star Wars: Episode I The Phan… | "Star Wars: Episode II Attack … | "Star Wars: Episode III Reveng… | "Star Wars: Episode IV A New H… | "Star Wars: Episode V The Empir… | "Star Wars: Episode VI Return o… | "3" | "2" | "1" | "4" | "5" | "6" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "I don't understand this questi… | "Yes" | "No" | "No" | "Male" | "18-29" | null | "High school degree" | "South Atlantic" |
| 3292879538 | "No" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | "Yes" | "Male" | "18-29" | "$0 - $24,999" | "Bachelor degree" | "West South Central" |
| 3292765271 | "Yes" | "No" | "Star Wars: Episode I The Phan… | "Star Wars: Episode II Attack … | "Star Wars: Episode III Reveng… | null | null | null | "1" | "2" | "3" | "4" | "5" | "6" | "Somewhat favorably" | "Somewhat favorably" | "Somewhat favorably" | "Somewhat favorably" | "Somewhat favorably" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "I don't understand this questi… | "No" | null | "No" | "Male" | "18-29" | "$0 - $24,999" | "High school degree" | "West North Central" |
| 3292763116 | "Yes" | "Yes" | "Star Wars: Episode I The Phan… | "Star Wars: Episode II Attack … | "Star Wars: Episode III Reveng… | "Star Wars: Episode IV A New H… | "Star Wars: Episode V The Empir… | "Star Wars: Episode VI Return o… | "5" | "6" | "1" | "2" | "4" | "3" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Somewhat favorably" | "Very favorably" | "Somewhat favorably" | "Somewhat unfavorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "I don't understand this questi… | "No" | null | "Yes" | "Male" | "18-29" | "$100,000 - $149,999" | "Some college or Associate degr… | "West North Central" |
| 3292731220 | "Yes" | "Yes" | "Star Wars: Episode I The Phan… | "Star Wars: Episode II Attack … | "Star Wars: Episode III Reveng… | "Star Wars: Episode IV A New H… | "Star Wars: Episode V The Empir… | "Star Wars: Episode VI Return o… | "5" | "4" | "6" | "2" | "1" | "3" | "Very favorably" | "Somewhat favorably" | "Somewhat favorably" | "Somewhat unfavorably" | "Very favorably" | "Very unfavorably" | "Somewhat favorably" | "Neither favorably nor unfavora… | "Very favorably" | "Somewhat favorably" | "Somewhat favorably" | "Very unfavorably" | "Somewhat favorably" | "Somewhat favorably" | "Greedo" | "Yes" | "No" | "No" | "Male" | "18-29" | "$100,000 - $149,999" | "Some college or Associate degr… | "West North Central" |
Filter the dataset to 835 respondents that have seen at least one film (Hint: Don’t use the column Have you seen any of the 6 films in the Star Wars franchise?) Not much to show here by way of output. Print the shape (i.e. number of rows and number of columns) of the filtered dataset.
Gathering the shape of the seen movies, the count is 835.
# Include and execute your code here
seen_cols = ["seen_epi_i", "seen_epi_ii", "seen_epi_iii", "seen_epi_iv", "seen_epi_v", "seen_epi_vi",]
df_seen = df_clean.filter(
(pl.col(df_clean.columns[3]).is_not_null())
| (pl.col(df_clean.columns[4]).is_not_null())
| (pl.col(df_clean.columns[5]).is_not_null())
| (pl.col(df_clean.columns[6]).is_not_null())
| (pl.col(df_clean.columns[7]).is_not_null())
| (pl.col(df_clean.columns[8]).is_not_null())
).select(seen_cols)
movies = ["The Phantom Menace", "Attack of the Clones", "Revenge of the Sith", "A New Hope", "The Empire Strikes Back", "Return of the Jedi"]
seen_counts = [df_seen[col].count() for col in seen_cols]
seen_percs = [i / df_seen.shape[0] for i in seen_counts]
df_percs = pl.DataFrame({"movie": movies[::-1], "percentage": seen_percs[::-1]})
df_percs = df_percs.with_columns(
((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
)
rank_cols = ["rank_epi_i", "rank_epi_ii", "rank_epi_iii", "rank_epi_iv", "rank_epi_v", "rank_epi_vi",]
df_rank = df_clean.filter(
(pl.col(df_clean.columns[3]).is_not_null())
& (pl.col(df_clean.columns[4]).is_not_null())
& (pl.col(df_clean.columns[5]).is_not_null())
& (pl.col(df_clean.columns[6]).is_not_null())
& (pl.col(df_clean.columns[7]).is_not_null())
& (pl.col(df_clean.columns[8]).is_not_null())
).select(rank_cols)
# Episode 3 has 1 missing rank (when compared to the other movies the rank option left is 6)
df_rank = df_rank.with_columns(pl.col('rank_epi_iii').fill_null("6"))
rank_counts = [df_rank.filter(pl.col(i) == "1").height for i in rank_cols]
# .height = .shape[0] in polars
rank_percs = [i / df_rank.height for i in rank_counts]
df_rank_percs = pl.DataFrame({"movie": movies[::-1], "percentage": rank_percs[::-1]})
df_rank_percs = df_rank_percs.with_columns(
((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
)
print(f"The number of respondents that have seen at least one movie: {df_seen.height}")The number of respondents that have seen at least one movie: 835
Validate that the data provided on GitHub lines up with the article by recreating 2 of the visuals from the article. These visuals should be similar, but don’t have to be exact. They need to be close enough that we can validate that the values in the dataset match the graphs in the chart. Though their charts were built using a different plotting software, the more you push yourself for an exact replica, the more you will learn. Spend at least a couple of hours on this.
The Empire Strikes Back is the most seen film of the saga. It also happens to be the favorite.
# Include and execute your code here
movie_graph = (
ggplot(data=df_percs)
+ geom_bar(mapping=aes(x='percentage', y='movie'), stat='identity', orientation='y', color="lightblue", fill="lightblue")
+ labs(
title="Which 'Star Wars' Movie's Have You Seen?",
subtitle="Of 835 respondents who have seen any film",
x='',
y=''
)
+ scale_x_continuous(limits=[0,1])
+ geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.075, size=12, color='black')
+ theme(
panel_background=element_rect(fill='gray', linetype=0),
plot_background=element_rect(fill='gray'),
# panel_grid_major=element_rect(fill='gray'),
panel_grid=element_blank(),
legend_background=element_rect(fill='gray'),
axis_text=element_text(color='black', size=18),
# axis_title=element_text(color='white'),
plot_title=element_text(color='black', face="bold", hjust=0, size=25),
plot_subtitle=element_text(color='black', hjust=0, size=20),
legend_text=element_text(color='white'),
legend_title=element_text(color='white'),
label_text=element_text(color='white'),
axis_line_x=element_blank(),
axis_ticks_x=element_blank(),
axis_text_x=element_blank(),
plot_title_position='plot'
)
+ ggsize(1600, 900)
)
display(movie_graph)rank_graph = (
ggplot(data=df_rank_percs)
+ geom_bar(mapping=aes(x='percentage', y='movie'), stat='identity', orientation='y', color="lightblue", fill="lightblue")
+ labs(
title="What's the Best 'Star Wars' Movie?",
subtitle="Of 741 respondents who have seen all 6 films",
x='',
y=''
)
+ scale_x_continuous(limits=[0,0.5])
+ geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.075, size=12, color='black')
+ theme(
panel_background=element_rect(fill='gray', linetype=0),
plot_background=element_rect(fill='gray'),
panel_grid=element_blank(),
legend_background=element_rect(fill='gray'),
axis_text=element_text(color='black', size=18),
plot_title=element_text(color='black', face="bold", hjust=0, size=25),
plot_subtitle=element_text(color='black', hjust=0, size=20),
legend_text=element_text(color='white'),
legend_title=element_text(color='white'),
label_text=element_text(color='white'),
axis_line_x=element_blank(),
axis_ticks_x=element_blank(),
axis_text_x=element_blank(),
plot_title_position='plot'
)
+ ggsize(1600, 900)
)
display(rank_graph)